########################################################################
# DIAGNOSTIC TOOL
# to detect influential observations
# /!\ the assumed underlying distribution (used to compute the cutoffs) is normal 
########################################################################

# Files needed to run this script:
# - MCV_consist
# - consistMCD
# - consistS
# - fastS_consistency
# - fastS_normality


#-------------------- Main function

DiagTool<-function(data, estim="MCD", bdp=0.25, alpha=0.05){
  # INPUT
  # data : data matrix
  # estim : choice of location and scatter 
  #     'class' : sample estimate
  #     'MCD'   : MCD estimate (raw)
  #     'RMCD'  : one-step reweighted MCD estimate
  #     'S'     : S estimate with Tukey's biweight function
  # bdp : breakdown point for the MCD, RMCD or S estimators
  # alpha : value for the upper and lower quantiles
  # OUTPUT
  # EIF_R : empirical influence measures for R MCV
  # EIF_VN : empirical influence measures for VN MCV
  # cov : scatter estimate (consistent)
  # mean: location estimate (consistent)
  # low_R : indexes of observations with low EIF for R
  # hi_R : indexes of observations with high EIF for R
  # low_VN : indexes of observations with low EIF for VN
  # hi_VN : indexes of observations with high EIF for VN

  data<-as.matrix(data)
  n<- nrow(data)
  p<-ncol(data)
  if(n<p) stop("the sample size needs to be larger than the dimension")

  dist="norm"
  df=0
  
  #------------- Parameter estimates
  if(estim=="class"){
    mean<- apply(data,2,mean)
    cov<- ((n-1)/n ) *cov(data)
  }else if(estim=="MCD" || estim=="RMCD"){
    # initial MCD
      MCDraw<-consistMCD(data=data, dist=dist, df=df,bdp=bdp)
      mean<- MCDraw$center
      cov<-MCDraw$cov
    # reweighted MCD
    if(estim=="RMCD"){
      RewMCD<-consistRMCD(data=data,moy0=mean, cov0=cov, dist=dist, df=df)
      mean<- RewMCD$center
      cov<-RewMCD$cov
    }
  }else if(estim=="S"){
    Sestim<- consistS(data=data,dist=dist,df=df,bdp=bdp)
    mean<- Sestim$center
    cov<- Sestim$cov
  }else stop("This is not  valid estim value. The possible values are: 'class', 'MCD', 'RMCD' or 'S'")
  
  #------------ Empirical influence measures
    EIF_VN<-apply(data,1,IFVNClassical,mu=mean,sigma=cov)	
    EIF_R<-apply(data,1,IFReyClassical,mu=mean,sigma=cov)	
    
  #------------ Cutoff computation
    cutoffRhi<- as.numeric(cutoff_EIFRey(mu=mean,sigma=cov, alpha=1-alpha/2))
    cutoffRlow<- as.numeric(cutoff_EIFRey(mu=mean,sigma=cov, alpha=alpha/2))
    cutoffVNhi<- as.numeric(cutoff_EIFVN(mu=mean,sigma=cov, alpha=1-alpha/2))
    cutoffVNlow<- as.numeric(cutoff_EIFVN(mu=mean,sigma=cov, alpha=alpha/2))
   
  #------------ Determination of low and high influential points
    low_R <-which(EIF_R< cutoffRlow)
    hi_R <- which(EIF_R> cutoffRhi)
    low_VN <- which(EIF_VN < cutoffVNlow)
    hi_VN <- which(EIF_VN > cutoffVNhi)
    
  #----------- Plots 
    # EIF_R only
    pch_vec<-rep(1,n)
    pch_vec[low_R]<- 17
    pch_vec[hi_R]<- 15
    plot_EIFR<- plot(EIF_R, pch=pch_vec, ylab="Emp. influence measures R")
    abline(h=cutoffRlow,col="red")
    abline(h=cutoffRhi, col="red")
    add_legend("topleft", legend=c("Regular", "Low influential", " High influential"), pch=c(1,17,15), 
               horiz=F, bty='n', cex=0.8)
    
    # EIF_VN only
    pch_vec<-rep(1,n)
    pch_vec[low_VN]<- 17
    pch_vec[hi_VN]<- 15
    plot_EIFVN<- plot(EIF_VN, pch=pch_vec, ylab="Emp. influence measures VN")
    abline(h=cutoffVNlow,col="red")
    abline(h=cutoffVNhi, col="red")
    add_legend("topleft", legend=c("Regular", "Low influential", " High influential"), pch=c(1,17,15), 
               horiz=F, bty='n', cex=0.8)
  
    # EIF_R vs EIF_VN
    pch_vec<-rep(1,n)
    pch_vec[low_R ]<- 17
    pch_vec[c(intersect(hi_R,hi_VN),intersect(low_R,low_VN))]<- 3
    pch_vec[c(setdiff(hi_R,intersect(hi_R,hi_VN)),setdiff(low_R,intersect(low_R,low_VN)))]<- 2
    pch_vec[c(setdiff(hi_VN,intersect(hi_R,hi_VN)),setdiff(low_VN,intersect(low_R,low_VN)))]<- 4
    plot_EIFRVN<- plot(EIF_R,EIF_VN, pch=pch_vec, ylab="Empirical IF (V&N)" , xlab="Empirical IF (R)")
    abline(v=cutoffRlow,col="red")
    abline(v=cutoffRhi, col="red")
    abline(h=cutoffRlow,col="red")
    abline(h=cutoffRhi, col="red")
    add_legend("topleft", legend=c("Regular", "Influential for R and V&N", "Influential for R only", "Influential for V&N only"),
               pch=c(1,2,3,4), 
               horiz=F, bty='n', cex=0.8)
    
    return(list(EIF_R=EIF_R,EIF_VN=EIF_VN, cov=cov, mean=mean,
                low_R=low_R,hi_R=hi_R,low_VN=low_VN, hi_VN=hi_VN, 
                 cutoffRlow=cutoffRlow,cutoffRhi=cutoffRhi,
                 cutoffVNlow=cutoffVNlow, cutoffVNhi=cutoffVNhi,
                plot_EIFR=plot_EIFR, plot_EIFVN=plot_EIFVN,
                plot_EIFRVN))
 }   

    
#---------------------- Auxiliary functions
cutoff_EIFVN<-function(mu,sigma,alpha){
  p<- length(mu)
  cv<-cvvoinov(mu,sigma)
  qalpha<-qchisq(alpha, df=1,ncp= cv^2 )
  cutoff<- (cv/2)* ( qalpha - (1+cv^2))
  return(cutoff)
}
cutoff_EIFRey<-function(mu,sigma,alpha){
  p<- length(mu)
  cv<-cvreyment(mu,sigma)
  ncp<-(p^2/(t(mu)%*% mu)^2)*(t(mu)%*% sigma%*% mu)
  qalpha<-qchisq(alpha, df=p,ncp= ncp)
  cutoff<- (cv/2)* ( qalpha/p - ncp/p -1 )
  return(cutoff)
}
DistMahala<-function(x,mu,sigma){
  return((x-mu)%*%solve(sigma) %*% (x-mu))
}


IFVNClassical<-function(x,mu,sigma){
  # Direct computation of the IF of the multivariate Voinov and Nikulin CV
  # based on the sample estimators (mean vector and covariance matrix)
  # under any model for which the true mean vector is mu and the true
  # covariance matrix is sigma; x is a p-dimensional vector
  p<-length(mu)
  cv<-cvvoinov(mu,sigma)
  t1<-((x-mu)%*% solve(sigma)%*% mu %*% mu %*% solve(sigma) %*% (x-mu))/(mu%*% solve(sigma) %*% mu)
  t2<- -2*(t(mu) %*% solve(sigma)%*% (x-mu))/(t(mu) %*% solve(sigma) %*%  mu)
  return((t1+t2-1)*(cv/2))
}

IFReyClassical<-function(x,mu,sigma){
  # Direct computation of the IF of the multivariate Reyment CV
  # based on the ML estimators (mean vector and covariance matrix)
  # under any model for which the true mean vector is mu and the true
  # covariance matrix is sigma; x is a p-dimensional vector
  p<-length(mu)
  cv<-cvreyment(mu,sigma)
  t1<-(t(x-mu) %*% solve(sigma) %*% (x-mu))/p
  t2<- -2*(t(mu) %*% x)/(t(mu) %*% mu)
  return((t1+1+t2)*(cv/2))
}


add_legend <- function(...) {
  opar <- par(fig=c(0, 1, 0, 1), oma=c(0, 0, 0, 0), 
              mar=c(0, 0, 0, 0), new=TRUE)
  on.exit(par(opar))
  plot(0, 0, type='n', bty='n', xaxt='n', yaxt='n')
  legend(...)
}